Objective: visualize the data and make the right decisions to clean the dataset.
import PIL
from PIL import Image
import glob
import sys
import os
import datetime
import pandas as pd
import pandas_profiling
from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import imagenet_utils
from IPython.display import Image
from tensorflow import keras
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Root folders of the image datasets explored in this notebook.
# Every root is expected to contain one sub-folder per class with the image files inside.
# Raw image dump; passed to create_data with n=1 below, which parses <brand>_<model>_<year> folder names.
FOLDER_RD = "/mnt/data/nahuel/images/"
# Dataset v1 (train split only); class folders are read as <brand>_<model>.
FOLDER_P1_TRAIN = "/mnt/data/dataset_v1/train"
# Dataset v2, train and test splits.
FOLDER_P2_TRAIN = "/mnt/data/dataset_v2/train"
FOLDER_P2_TEST = "/mnt/data/dataset_v2/test"
# Dataset v3, train and test splits.
FOLDER_P3_TRAIN = "/mnt/data/dataset_v3/train"
FOLDER_P3_TEST = "/mnt/data/dataset_v3/test"
def quick_plot(data, kind, alpha, cmap='RdBu_r'):
    """Plot the number of rows per brand and print summary statistics.

    Args:
        data: DataFrame with at least the columns ``brand`` and ``model``.
        kind: pandas plot kind passed to ``Series.plot`` (e.g. ``"bar"``, ``"area"``).
        alpha: Opacity of the drawn artists.
        cmap: Matplotlib colormap name used for the plot.

    Side effects:
        Shows a matplotlib figure and prints the per-brand counts (transposed)
        followed by their ``describe()`` summary.
    """
    plt.style.use('ggplot')
    sns.set_style('darkgrid')

    # Rows per brand, largest class first; reused for the plot and the tables.
    brand_counts = (
        data.groupby("brand")["model"].count().sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(15, 6))
    brand_counts.plot(kind=kind,
                      alpha=alpha,
                      cmap=cmap,  # honour the caller's colormap
                      legend=True,
                      linewidth=2.0,
                      ax=ax)
    # Labels are applied after drawing: pandas sets the x label from the
    # index name while plotting, which would replace a label set beforehand.
    ax.set_xlabel('Brand')
    ax.set_ylabel('Count')
    ax.set_title('Class distribution')
    plt.show()

    data_group = brand_counts.to_frame(name='model')
    print(data_group.T, '\n')
    print(data_group.describe().T)
def display_one_image(dir_, image):
    """Show one image and print basic facts about it and its dataset folder.

    Args:
        dir_: Dataset root (``pathlib.Path`` or ``str``) whose ``*/*.jpg``
            files are counted.
        image: Image array in BGR channel order, as returned by ``cv2.imread``.

    Raises:
        ValueError: If ``image`` is ``None`` (``cv2.imread`` returns ``None``
            when a file cannot be read).

    Side effects:
        Shows the image and prints its type, shape, min/max values, the pixel
        at row 100 / column 100 (when it exists), the folder path and the
        number of ``.jpg`` files found one level below the folder.
    """
    if image is None:
        raise ValueError("image is None; check the path passed to cv2.imread")

    # Accept plain strings as well as Path objects.
    data_dir = pathlib.Path(dir_)
    # Count lazily instead of building a list of every path.
    image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))

    # matplotlib expects RGB, OpenCV delivers BGR.
    img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(img_rgb)
    plt.show()

    print(f'Image type: {type(image)}')
    print(f'Image shape: {image.shape} (Height x Width x Channels)')
    print(f'Image min value: {image.min()}')
    print(f'Image max value: {image.max()}')

    # Sample one pixel, but only when the image is large enough to contain it.
    row, col = 100, 100
    if image.shape[0] > row and image.shape[1] > col:
        print(image[row, col])
    else:
        print(f'Pixel ({row}, {col}) is outside the image bounds')

    print('Path: ', data_dir)
    print('Total images: ', image_count)
def display_images(img_folder, size=(20, 20)):
    """Show up to five random images of a folder and print its file count.

    Args:
        img_folder: Path of a folder that directly contains image files.
        size: Figure size ``(width, height)`` in inches.

    Side effects:
        Draws the sampled images side by side in one figure and prints the
        number of regular files in ``img_folder``.
    """
    # List the folder once; only regular files can be displayed or counted.
    files = [
        entry for entry in os.listdir(img_folder)
        if os.path.isfile(os.path.join(img_folder, entry))
    ]

    # Sample without replacement so the same image is not shown twice, and
    # never ask for more images than the folder holds.
    chosen = random.sample(files, min(5, len(files)))

    plt.figure(figsize=size)
    for position, file_name in enumerate(chosen, start=1):
        img = mpimg.imread(os.path.join(img_folder, file_name))
        plt.subplot(1, 5, position)
        plt.imshow(img)

    print("Total images: ", len(files))
def create_data(image_folder, n=0):
    """Index an image directory tree into a DataFrame.

    ``image_folder`` must contain one sub-folder per class; every entry of a
    class folder becomes one row. Entries of ``image_folder`` that are not
    directories are ignored.

    Args:
        image_folder: Root folder holding the class sub-folders.
        n: Folder-name layout. With ``n == 1`` the class folder is parsed as
            ``<brand>_<model>_<year>`` and a ``year`` column is added;
            otherwise ``model`` is the full folder name.

    Returns:
        DataFrame with columns ``filename``, ``brand``, ``model`` and, when
        ``n == 1``, ``year``. Rows are ordered by class folder, then file name.
    """
    filenames = []
    class_names = []
    # Sorted listing keeps the row order reproducible across runs/filesystems.
    for class_name in sorted(os.listdir(image_folder)):
        class_folder = os.path.join(image_folder, class_name)
        if not os.path.isdir(class_folder):
            continue  # stray file next to the class folders
        for image in sorted(os.listdir(class_folder)):
            filenames.append(os.path.join(class_folder, image))
            class_names.append(class_name)

    data = pd.DataFrame({'filename': filenames}, columns=['filename'])

    # Derive the labels from the folder name directly instead of re-splitting
    # the full path on a hard-coded "/" separator.
    folder = pd.Series(class_names, index=data.index, dtype=object)
    parts = folder.str.split('_')

    data['brand'] = parts.str[0]
    if n == 1:
        data['model'] = parts.str[1]
        data['year'] = parts.str[2]
    else:
        data['model'] = folder
    return data
def plot_multi_boxplot_swarmpplot(data, size=(15, 8)):
    """Overlay a swarm plot and a box plot of the per-brand row counts.

    Args:
        data: DataFrame with a ``brand`` column; every other column is counted
            per brand and drawn as one horizontal box.
        size: Figure size ``(width, height)`` in inches.

    Side effects:
        Shows the figure and prints the per-brand count table (transposed).
    """
    plt.style.use('ggplot')
    sns.set_style('darkgrid')

    # Computed once; shared by both plots and the printed table.
    brand_counts = data.groupby("brand").count()

    plt.figure(figsize=size)
    sns.swarmplot(data=brand_counts,
                  dodge=True,  # seaborn renamed the former `split` keyword
                  color='white',
                  linewidth=2.0,
                  orient='h',
                  alpha=0.9)
    sns.boxplot(data=brand_counts,
                palette='RdBu_r',
                orient='h',
                linewidth=2.0)
    plt.xlabel('Count')
    plt.title('Data distribution')
    plt.show()

    print(brand_counts.T)
def Stacking_graphs(data, class_name, k=True):
    """Draw a stacked histogram of the row counts per ``class_name`` group.

    Args:
        data: DataFrame containing the column ``class_name``.
        class_name: Column to group by; also appended to the figure title.
        k: Forwarded to ``sns.histplot`` as ``kde``.
    """
    plt.style.use('ggplot')
    sns.set_style('darkgrid')

    plt.figure(figsize=(15, 5))
    plt.title('Stacking Per ' + class_name)
    plt.xlabel('')

    # One row per group, one column per remaining DataFrame column.
    per_group_counts = data.groupby(class_name).count()
    histogram_options = dict(
        palette='RdBu_r',
        linewidth=3.0,
        bins=30,
        legend=True,
        stat='count',
        multiple="stack",
        kde=k,
        alpha=0.80,
    )
    sns.histplot(per_group_counts, **histogram_options)
    plt.show()
def plot_matrix(data, class_name, n=1):
    """Draw a heatmap of the per-group counts or of their correlation.

    Args:
        data: DataFrame containing the column ``class_name``.
        class_name: Column to group by.
        n: ``1`` draws the annotated correlation matrix of the per-group
            counts; any other value draws the raw count table.
    """
    plt.style.use('ggplot')
    sns.set_style('darkgrid')

    if n != 1:
        # Raw counts: tall figure, tick labels on top, no axis titles.
        _, axis = plt.subplots(figsize=(12, 12))
        axis.set(xlabel="", ylabel="")
        axis.xaxis.tick_top()
        group_counts = data.groupby(class_name).count()
        sns.heatmap(group_counts,
                    cmap='rocket_r',
                    annot=False,
                    fmt=".1f",
                    linewidth=.5)
    else:
        # Correlation between the count columns, annotated cell by cell.
        plt.figure(figsize=(10, 5))
        plt.title('Correlation matrix')
        correlation = data.groupby(class_name).count().corr()
        sns.heatmap(correlation,
                    annot=True,
                    fmt='.1f',
                    linewidth=.3,
                    cmap='rocket_r')
    plt.show()
def univ_dist(data, name, size=(5, 5)):
    """Plot absolute and relative frequencies of the 25 most common values.

    Args:
        data: DataFrame containing the column ``name`` (and ``model`` when
            ``name`` is not ``'filename'``).
        name: Column whose value distribution is shown.
        size: Figure size ``(width, height)`` in inches, used for both figures.

    Side effects:
        Shows two horizontal bar charts (counts, then percentages). Unless
        ``name == 'filename'`` the matching tables are printed as well.
    """
    plt.style.use('ggplot')
    sns.set_style('darkgrid')

    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    # The index name is dropped so the axes look the same on every pandas
    # version (pandas >= 2.0 names the index after the column).
    counts = (
        data[name].value_counts()
        .sort_values(ascending=False)
        .head(25)
        .rename_axis(None)
    )

    plt.figure(figsize=size)
    plt.xlabel('Count')
    plt.title('Absolute frequencies of ' + name)
    counts.plot(kind='barh',
                cmap='RdBu_r',
                linewidth=3.0,
                alpha=0.75,
                width=1)
    plt.show()
    if name != 'filename':
        print(pd.DataFrame(
            data.groupby(name)["model"].count().sort_values(ascending=False).head(25),
            columns=['model']).T)

    # Sort the Series before turning it into a frame: the result of
    # value_counts is called "count" on pandas >= 2.0, so sorting a frame by
    # a column called `name` would raise KeyError there.
    percentages = (
        (100 * data[name].value_counts() / len(data[name]))
        .sort_values(ascending=False)
        .head(25)
        .rename_axis(None)
        .to_frame(name=name)
    )
    percentages.plot(kind='barh',
                     figsize=size,  # honour the caller's size here too
                     cmap='tab20',
                     linewidth=3.0,
                     alpha=0.6,
                     width=1)
    plt.xlabel('Percentage (%)')
    plt.title('Relative frequencies of ' + name)
    plt.show()
    if name != 'filename':
        print(percentages.T)
def info_data(data):
    """Print a structural summary of ``data`` and plot its class distribution.

    Args:
        data: DataFrame with ``brand`` and ``model`` columns.

    Side effects:
        Prints ``DataFrame.info()``, the null count per column, and draws the
        per-brand distribution twice via ``quick_plot`` (area and bar).
    """
    # DataFrame.info() writes to stdout itself and returns None, so wrapping
    # it in print() would emit a stray "None".
    data.info()
    print()
    print(data.isnull().sum(), '\n')
    quick_plot(data, kind="area", alpha=0.3)
    quick_plot(data, kind="bar", alpha=0.75)
def all_class_plots(data):
    """Draw a horizontal bar chart with the number of rows of every brand.

    Args:
        data: DataFrame with ``brand`` and ``model`` columns.
    """
    plt.figure(figsize=(15, 15))
    plt.title('Distribution of all brands')

    # Ascending sort, so the largest brand ends up at the top of the chart.
    rows_per_brand = data.groupby("brand")["model"].count()
    rows_per_brand = rows_per_brand.sort_values()

    bar_style = dict(fontsize=8, alpha=0.75, cmap='RdBu_r', width=0.8)
    rows_per_brand.plot(kind="barh", **bar_style)
    plt.show()
def Samples_info(data_dir_train, data_dir_test):
    """Count the ``.jpg`` files of a train and a test folder.

    Only files matching ``*/*.jpg`` are counted, i.e. files located exactly
    one level below each folder.

    Args:
        data_dir_train: Train root, as ``pathlib.Path`` or ``str``.
        data_dir_test: Test root, as ``pathlib.Path`` or ``str``.

    Returns:
        Tuple ``(count_train, count_test, total)``.
    """
    def _count_jpgs(folder):
        # Consume the glob lazily; no need to hold every path in memory.
        return sum(1 for _ in pathlib.Path(folder).glob('*/*.jpg'))

    count_train = _count_jpgs(data_dir_train)
    count_test = _count_jpgs(data_dir_test)
    return count_train, count_test, count_train + count_test
def convert_data(data):
    """Turn a sequence of counts into a one-row DataFrame.

    The input is loaded as a single column, transposed into a row, and the
    positional column labels 0, 1, 2 are renamed to Train, Test, Total.

    Args:
        data: Sequence of values, e.g. the tuple returned by ``Samples_info``.

    Returns:
        The transposed DataFrame with the renamed columns.
    """
    column_labels = {0: 'Train', 1: 'Test', 2: 'Total'}
    as_row = pd.DataFrame(data).T
    return as_row.rename(columns=column_labels)
/usr/lib/python3/dist-packages/OpenSSL/crypto.py:12: CryptographyDeprecationWarning: Python 3.6 is no longer supported by the Python core team. Therefore, support for it is deprecated in cryptography and will be removed in a future release. from cryptography import x509
1.2.1 Download & Preprocessing
The notebooks used are available in the project's notebook folder, together with a link to the GitLab repository dataset_cleaning.
Display of 5 randomly selected images.
# Raw dataset: show random images from one class folder and print how many files it holds.
img_folder=r'/mnt/data/nahuel/images/abarth_124-spider_2016/'
display_images(img_folder)
Total images: 243
# Raw dataset: inspect one image in detail and count every */*.jpg under the dataset root.
data_dir = pathlib.Path(r'/mnt/data/nahuel/images/')
# cv2.imread loads the file as a numpy array; display_one_image converts it from BGR to RGB for plotting.
img = cv2.imread('/mnt/data/nahuel/images/abarth_124-spider_2016/905_7_7d71c74b-ed3d-c603-e053-e250040a6889_f26b7bc7-95ff-4012-bf97-e386e657bf48.jpg')
display_one_image(data_dir,img)
Image type: <class 'numpy.ndarray'> Image shape: (480, 640, 3) (Height x Width x Channels) Image min value: 0 Image max value: 255 [77 74 66] Path: /mnt/data/nahuel/images Total images: 1631955
Display of 5 randomly selected images.
# Dataset v1 (train): show random images from one class folder and print how many files it holds.
img_folder=r'/mnt/data/dataset_v1/train/porsche_911/'
display_images(img_folder)
Total images: 200
# Dataset v1 (train): inspect one image in detail and count every */*.jpg under the train root.
data_dir_train = pathlib.Path(r'/mnt/data/dataset_v1/train/')
img = cv2.imread('/mnt/data/dataset_v1/train/porsche_911/39978_3_8efe016c-d00f-c14d-e053-e350040a0db0_862dacbd-567c-47d6-aede-8583ebea3cfe.jpg')
display_one_image(data_dir_train,img)
Image type: <class 'numpy.ndarray'> Image shape: (236, 433, 3) (Height x Width x Channels) Image min value: 0 Image max value: 255 [82 79 75] Path: /mnt/data/dataset_v1/train Total images: 130197
Display of 5 randomly selected images.
# Dataset v2 (train): show random images from one class folder and print how many files it holds.
img_folder=r'/mnt/data/dataset_v2/train/porsche_911/'
display_images(img_folder)
Total images: 171
# Dataset v2 (train): inspect one image and count every */*.jpg under the train root.
# data_dir_train is reused by the Samples_info call further down.
data_dir_train = pathlib.Path(r'/mnt/data/dataset_v2/train/')
img = cv2.imread('/mnt/data/dataset_v2/train/porsche_911/39976_3_f0006f61-3068-164e-e053-e350040aa153_a0170772-0984-427d-a309-767eb942eeef.jpg')
display_one_image(data_dir_train,img)
Image type: <class 'numpy.ndarray'> Image shape: (346, 455, 3) (Height x Width x Channels) Image min value: 0 Image max value: 255 [137 155 156] Path: /mnt/data/dataset_v2/train Total images: 65312
# Dataset v2 (test): inspect one image and count every */*.jpg under the test root.
# data_dir_test is reused by the Samples_info call further down.
data_dir_test = pathlib.Path(r'/mnt/data/dataset_v2/test/')
img = cv2.imread('/mnt/data/dataset_v2/test/porsche_911/39870_2_9113c4a3-3f67-4d27-8611-6fec78450fcd_2b1542cc-fcbe-495a-a650-8a19f54dfef4.jpg')
display_one_image(data_dir_test,img)
Image type: <class 'numpy.ndarray'> Image shape: (236, 496, 3) (Height x Width x Channels) Image min value: 0 Image max value: 255 [172 166 159] Path: /mnt/data/dataset_v2/test Total images: 7257
# Train/test/total .jpg counts for dataset v2 as a one-row table.
# Relies on data_dir_train and data_dir_test as assigned in the preceding cells.
P2_samples = convert_data(Samples_info(data_dir_train,data_dir_test))
P2_samples
| Train | Test | Total | |
|---|---|---|---|
| 0 | 65312 | 7257 | 72569 |
P(2) contains 72569 samples in total (train + test).
Display of 5 randomly selected images.
# Dataset v3 (train): show random images from one class folder and print how many files it holds.
img_folder=r'/mnt/data/dataset_v3/train/porsche_900_series/'
display_images(img_folder)
Total images: 346
# Dataset v3 (train): inspect one image and count every */*.jpg under the train root.
# Rebinds data_dir_train, which is reused by the Samples_info call further down.
data_dir_train = pathlib.Path(r'/mnt/data/dataset_v3/train/')
img = cv2.imread('/mnt/data/dataset_v3/train/porsche_900_series/40480_14_680571fe-d8db-422a-94c1-c0a3b66ea2a9_193880c5-2cb4-47bf-9833-ceeb13c79d20.jpg')
display_one_image(data_dir_train,img)
Image type: <class 'numpy.ndarray'> Image shape: (383, 620, 3) (Height x Width x Channels) Image min value: 0 Image max value: 255 [247 252 251] Path: /mnt/data/dataset_v3/train Total images: 65312
# Dataset v3 (test): inspect one image and count every */*.jpg under the test root.
# Rebinds data_dir_test, which is reused by the Samples_info call further down.
data_dir_test = pathlib.Path(r'/mnt/data/dataset_v3/test/')
img = cv2.imread('/mnt/data/dataset_v3/test/lexus_ct-200h/20595_7_549802ba-ce39-47f5-b8b4-86fece3d964d_5222cb28-8ca3-4d1a-93bc-40b6ab26384d.jpg')
display_one_image(data_dir_test,img)
Image type: <class 'numpy.ndarray'> Image shape: (400, 547, 3) (Height x Width x Channels) Image min value: 0 Image max value: 255 [25 12 10] Path: /mnt/data/dataset_v3/test Total images: 7257
# Train/test/total .jpg counts for dataset v3 as a one-row table.
# Relies on data_dir_train and data_dir_test as assigned in the preceding cells.
P3_samples = convert_data(Samples_info(data_dir_train,data_dir_test))
P3_samples
| Train | Test | Total | |
|---|---|---|---|
| 0 | 65312 | 7257 | 72569 |
P(3) contains 72569 samples in total (train + test).
Extract the list of classes from the image directory and store it in CSV format.
# Index the raw dataset; n=1 makes create_data split the class folder into brand, model and year.
df_RD = create_data(FOLDER_RD,1)
df_RD.head()
| filename | brand | model | year | |
|---|---|---|---|---|
| 0 | /mnt/data/nahuel/images/abarth_124-spider_2016... | abarth | 124-spider | 2016 |
| 1 | /mnt/data/nahuel/images/abarth_124-spider_2016... | abarth | 124-spider | 2016 |
| 2 | /mnt/data/nahuel/images/abarth_124-spider_2016... | abarth | 124-spider | 2016 |
| 3 | /mnt/data/nahuel/images/abarth_124-spider_2016... | abarth | 124-spider | 2016 |
| 4 | /mnt/data/nahuel/images/abarth_124-spider_2016... | abarth | 124-spider | 2016 |
df_RD.nunique()
filename 1632113 brand 57 model 823 year 19 dtype: int64
info_data(df_RD)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1632113 entries, 0 to 1632112 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 1632113 non-null object 1 brand 1632113 non-null object 2 model 1632113 non-null object 3 year 1632113 non-null object dtypes: object(4) memory usage: 49.8+ MB None filename 0 brand 0 model 0 year 0 dtype: int64
brand mercedes-benz bmw volkswagen audi fiat citroen renault \
model 230990 147449 130423 81760 72367 68487 68458
brand ford mini opel ... aston-martin tesla hummer isuzu mg \
model 67695 64446 59877 ... 2214 2206 1532 1448 1038
brand ligier rover corvette lada cadillac
model 753 603 287 272 253
[1 rows x 57 columns]
count mean std min 25% 50% 75% \
model 57.0 28633.561404 41410.185854 253.0 4854.0 13861.0 31336.0
max
model 230990.0
brand mercedes-benz bmw volkswagen audi fiat citroen renault \
model 230990 147449 130423 81760 72367 68487 68458
brand ford mini opel ... aston-martin tesla hummer isuzu mg \
model 67695 64446 59877 ... 2214 2206 1532 1448 1038
brand ligier rover corvette lada cadillac
model 753 603 287 272 253
[1 rows x 57 columns]
count mean std min 25% 50% 75% \
model 57.0 28633.561404 41410.185854 253.0 4854.0 13861.0 31336.0
max
model 230990.0
# Index dataset v1 (train); with the default n=0 the model column keeps the full class folder name.
df_P1 = create_data(FOLDER_P1_TRAIN)
df_P1.head()
| filename | brand | model | |
|---|---|---|---|
| 0 | /mnt/data/dataset_v1/train/chevrolet_captiva/3... | chevrolet | chevrolet_captiva |
| 1 | /mnt/data/dataset_v1/train/chevrolet_captiva/2... | chevrolet | chevrolet_captiva |
| 2 | /mnt/data/dataset_v1/train/chevrolet_captiva/2... | chevrolet | chevrolet_captiva |
| 3 | /mnt/data/dataset_v1/train/chevrolet_captiva/1... | chevrolet | chevrolet_captiva |
| 4 | /mnt/data/dataset_v1/train/chevrolet_captiva/2... | chevrolet | chevrolet_captiva |
df_P1.nunique()
filename 130200 brand 54 model 651 dtype: int64
info_data(df_P1)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 130200 entries, 0 to 130199 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 130200 non-null object 1 brand 130200 non-null object 2 model 130200 non-null object dtypes: object(3) memory usage: 3.0+ MB None filename 0 brand 0 model 0 dtype: int64
brand mercedes-benz bmw volkswagen audi fiat ford peugeot citroen \
model 22000 11600 8800 6000 5400 5200 4800 4600
brand mini nissan ... saab rover isuzu iveco ligier caravans-wohnm \
model 4600 4200 ... 400 200 200 200 200 200
brand mg bentley tesla hummer
model 200 200 200 200
[1 rows x 54 columns]
count mean std min 25% 50% 75% max
model 54.0 2411.111111 3528.090331 200.0 600.0 1200.0 2750.0 22000.0
brand mercedes-benz bmw volkswagen audi fiat ford peugeot citroen \
model 22000 11600 8800 6000 5400 5200 4800 4600
brand mini nissan ... saab rover isuzu iveco ligier caravans-wohnm \
model 4600 4200 ... 400 200 200 200 200 200
brand mg bentley tesla hummer
model 200 200 200 200
[1 rows x 54 columns]
count mean std min 25% 50% 75% max
model 54.0 2411.111111 3528.090331 200.0 600.0 1200.0 2750.0 22000.0
# Index dataset v2 (train); with the default n=0 the model column keeps the full class folder name.
df_P2_train = create_data(FOLDER_P2_TRAIN)
df_P2_train.head()
| filename | brand | model | |
|---|---|---|---|
| 0 | /mnt/data/dataset_v2/train/chevrolet_captiva/3... | chevrolet | chevrolet_captiva |
| 1 | /mnt/data/dataset_v2/train/chevrolet_captiva/2... | chevrolet | chevrolet_captiva |
| 2 | /mnt/data/dataset_v2/train/chevrolet_captiva/2... | chevrolet | chevrolet_captiva |
| 3 | /mnt/data/dataset_v2/train/chevrolet_captiva/1... | chevrolet | chevrolet_captiva |
| 4 | /mnt/data/dataset_v2/train/chevrolet_captiva/2... | chevrolet | chevrolet_captiva |
df_P2_train.nunique()
filename 65312 brand 43 model 387 dtype: int64
info_data(df_P2_train)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 65312 entries, 0 to 65311 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 65312 non-null object 1 brand 65312 non-null object 2 model 65312 non-null object dtypes: object(3) memory usage: 1.5+ MB None filename 0 brand 0 model 0 dtype: int64
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 7273 5338 5000 3764 3727 3085 3031 2865
brand ford renault ... ssangyong toyota daihatsu ds-automobiles lexus \
model 2573 2497 ... 352 345 344 343 324
brand ferrari bentley maserati iveco dodge
model 174 170 169 105 96
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1518.883721 1589.650384 96.0 431.0 905.0 2194.5 7273.0
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 7273 5338 5000 3764 3727 3085 3031 2865
brand ford renault ... ssangyong toyota daihatsu ds-automobiles lexus \
model 2573 2497 ... 352 345 344 343 324
brand ferrari bentley maserati iveco dodge
model 174 170 169 105 96
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1518.883721 1589.650384 96.0 431.0 905.0 2194.5 7273.0
# Index dataset v2 (test); with the default n=0 the model column keeps the full class folder name.
df_P2_test = create_data(FOLDER_P2_TEST)
df_P2_test.head()
| filename | brand | model | |
|---|---|---|---|
| 0 | /mnt/data/dataset_v2/test/bmw_650/6362_1_43583... | bmw | bmw_650 |
| 1 | /mnt/data/dataset_v2/test/bmw_650/6366_6_f378a... | bmw | bmw_650 |
| 2 | /mnt/data/dataset_v2/test/bmw_650/6358_4_35abb... | bmw | bmw_650 |
| 3 | /mnt/data/dataset_v2/test/bmw_650/6386_1_95445... | bmw | bmw_650 |
| 4 | /mnt/data/dataset_v2/test/bmw_650/6408_4_57e9a... | bmw | bmw_650 |
df_P2_test.nunique()
filename 7257 brand 43 model 387 dtype: int64
info_data(df_P2_test)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7257 entries, 0 to 7256 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 7257 non-null object 1 brand 7257 non-null object 2 model 7257 non-null object dtypes: object(3) memory usage: 170.2+ KB None filename 0 brand 0 model 0 dtype: int64
brand mercedes-benz volkswagen bmw fiat audi mini peugeot citroen \
model 826 574 552 418 398 349 338 307
brand ford opel ... daihatsu lexus ds-automobiles ssangyong saab \
model 290 266 ... 42 39 38 35 30
brand maserati ferrari bentley iveco dodge
model 20 18 18 13 9
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 168.767442 176.309905 9.0 53.5 87.0 241.5 826.0
brand mercedes-benz volkswagen bmw fiat audi mini peugeot citroen \
model 826 574 552 418 398 349 338 307
brand ford opel ... daihatsu lexus ds-automobiles ssangyong saab \
model 290 266 ... 42 39 38 35 30
brand maserati ferrari bentley iveco dodge
model 20 18 18 13 9
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 168.767442 176.309905 9.0 53.5 87.0 241.5 826.0
# Stack the v2 train and test indexes row-wise to analyse the whole dataset at once.
# NOTE(review): the original row labels are kept, so labels 0..7256 occur twice;
# pass ignore_index=True if unique labels are ever needed.
join_df_P2 = pd.concat([df_P2_train, df_P2_test], axis=0)
join_df_P2.nunique()
filename 72569 brand 43 model 387 dtype: int64
info_data(join_df_P2)
<class 'pandas.core.frame.DataFrame'> Int64Index: 72569 entries, 0 to 7256 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 72569 non-null object 1 brand 72569 non-null object 2 model 72569 non-null object dtypes: object(3) memory usage: 2.2+ MB None filename 0 brand 0 model 0 dtype: int64
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 8099 5912 5552 4162 4145 3434 3369 3172
brand ford renault ... ssangyong daihatsu saab ds-automobiles lexus \
model 2863 2749 ... 387 386 382 381 363
brand ferrari maserati bentley iveco dodge
model 192 189 188 118 105
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1687.651163 1765.634108 105.0 483.0 987.0 2443.0 8099.0
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 8099 5912 5552 4162 4145 3434 3369 3172
brand ford renault ... ssangyong daihatsu saab ds-automobiles lexus \
model 2863 2749 ... 387 386 382 381 363
brand ferrari maserati bentley iveco dodge
model 192 189 188 118 105
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1687.651163 1765.634108 105.0 483.0 987.0 2443.0 8099.0
# Index dataset v3 (train); with the default n=0 the model column keeps the full class folder name.
df_P3_train = create_data(FOLDER_P3_TRAIN)
df_P3_train.head()
| filename | brand | model | |
|---|---|---|---|
| 0 | /mnt/data/dataset_v3/train/chevrolet_cruze/971... | chevrolet | chevrolet_cruze |
| 1 | /mnt/data/dataset_v3/train/chevrolet_cruze/949... | chevrolet | chevrolet_cruze |
| 2 | /mnt/data/dataset_v3/train/chevrolet_cruze/855... | chevrolet | chevrolet_cruze |
| 3 | /mnt/data/dataset_v3/train/chevrolet_cruze/857... | chevrolet | chevrolet_cruze |
| 4 | /mnt/data/dataset_v3/train/chevrolet_cruze/961... | chevrolet | chevrolet_cruze |
df_P3_train.nunique()
filename 65312 brand 43 model 273 dtype: int64
info_data(df_P3_train)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 65312 entries, 0 to 65311 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 65312 non-null object 1 brand 65312 non-null object 2 model 65312 non-null object dtypes: object(3) memory usage: 1.5+ MB None filename 0 brand 0 model 0 dtype: int64
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 7273 5338 5000 3764 3206 3085 3031 2865
brand ford renault ... saab toyota daihatsu ds-automobiles lexus \
model 2573 2497 ... 352 345 344 343 324
brand ferrari bentley maserati iveco dodge
model 174 170 169 105 96
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1518.883721 1568.471188 96.0 433.0 1025.0 2194.5 7273.0
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 7273 5338 5000 3764 3206 3085 3031 2865
brand ford renault ... saab toyota daihatsu ds-automobiles lexus \
model 2573 2497 ... 352 345 344 343 324
brand ferrari bentley maserati iveco dodge
model 174 170 169 105 96
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1518.883721 1568.471188 96.0 433.0 1025.0 2194.5 7273.0
# Index dataset v3 (test) and report the number of distinct values per column.
df_P3_test = create_data(FOLDER_P3_TEST)
df_P3_test.nunique()
filename 7257 brand 43 model 273 dtype: int64
info_data(df_P3_test)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7257 entries, 0 to 7256 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 7257 non-null object 1 brand 7257 non-null object 2 model 7257 non-null object dtypes: object(3) memory usage: 170.2+ KB None filename 0 brand 0 model 0 dtype: int64
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 826 574 552 398 357 349 338 307
brand ford opel ... daihatsu lexus ds-automobiles ssangyong saab \
model 290 266 ... 42 39 38 35 30
brand maserati bentley ferrari iveco dodge
model 20 18 18 13 9
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 168.767442 173.979725 9.0 53.5 96.0 241.5 826.0
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 826 574 552 398 357 349 338 307
brand ford opel ... daihatsu lexus ds-automobiles ssangyong saab \
model 290 266 ... 42 39 38 35 30
brand maserati bentley ferrari iveco dodge
model 20 18 18 13 9
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 168.767442 173.979725 9.0 53.5 96.0 241.5 826.0
# Stack the v3 train and test indexes row-wise to analyse the whole dataset at once.
# NOTE(review): the original row labels are kept, so labels 0..7256 occur twice;
# pass ignore_index=True if unique labels are ever needed.
join_df_P3 = pd.concat([df_P3_train, df_P3_test], axis=0)
join_df_P3.nunique()
filename 72569 brand 43 model 273 dtype: int64
info_data(join_df_P3)
<class 'pandas.core.frame.DataFrame'> Int64Index: 72569 entries, 0 to 7256 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 filename 72569 non-null object 1 brand 72569 non-null object 2 model 72569 non-null object dtypes: object(3) memory usage: 2.2+ MB None filename 0 brand 0 model 0 dtype: int64
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 8099 5912 5552 4162 3563 3434 3369 3172
brand ford renault ... ssangyong daihatsu saab ds-automobiles lexus \
model 2863 2749 ... 387 386 382 381 363
brand ferrari maserati bentley iveco dodge
model 192 189 188 118 105
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1687.651163 1742.113192 105.0 483.0 1148.0 2443.0 8099.0
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \
model 8099 5912 5552 4162 3563 3434 3369 3172
brand ford renault ... ssangyong daihatsu saab ds-automobiles lexus \
model 2863 2749 ... 387 386 382 381 363
brand ferrari maserati bentley iveco dodge
model 192 189 188 118 105
[1 rows x 43 columns]
count mean std min 25% 50% 75% max
model 43.0 1687.651163 1742.113192 105.0 483.0 1148.0 2443.0 8099.0
plot_multi_boxplot_swarmpplot(df_RD)
brand abarth alfa-romeo aston-martin audi bentley bmw cadillac \ filename 11350 17849 2214 81760 4854 147449 253 model 11350 17849 2214 81760 4854 147449 253 year 11350 17849 2214 81760 4854 147449 253 brand caravans-wohnm chevrolet chrysler ... saab seat skoda smart \ filename 2975 15681 10579 ... 4996 31336 23021 11430 model 2975 15681 10579 ... 4996 31336 23021 11430 year 2975 15681 10579 ... 4996 31336 23021 11430 brand ssangyong subaru suzuki tesla toyota volkswagen filename 8084 16632 28999 2206 8032 130423 model 8084 16632 28999 2206 8032 130423 year 8084 16632 28999 2206 8032 130423 [3 rows x 57 columns]
plot_multi_boxplot_swarmpplot(df_P1)
brand abarth alfa-romeo aston-martin audi bentley bmw \ filename 1600 1600 400 6000 200 11600 model 1600 1600 400 6000 200 11600 brand caravans-wohnm chevrolet chrysler citroen ... saab seat \ filename 200 1800 1200 4600 ... 400 2200 model 200 1800 1200 4600 ... 400 2200 brand skoda smart ssangyong subaru suzuki tesla toyota volkswagen filename 1400 1000 800 1200 2600 200 600 8800 model 1400 1000 800 1200 2600 200 600 8800 [2 rows x 54 columns]
plot_multi_boxplot_swarmpplot(join_df_P2)
brand abarth alfa-romeo audi bentley bmw chevrolet chrysler \ filename 586 1356 4162 188 5552 956 764 model 586 1356 4162 188 5552 956 764 brand citroen dacia daihatsu ... renault saab seat skoda smart \ filename 3172 782 386 ... 2749 382 1332 1158 580 model 3172 782 386 ... 2749 382 1332 1158 580 brand ssangyong subaru suzuki toyota volkswagen filename 387 961 1513 389 5912 model 387 961 1513 389 5912 [2 rows x 43 columns]
plot_multi_boxplot_swarmpplot(join_df_P3)
brand abarth alfa-romeo audi bentley bmw chevrolet chrysler \ filename 1168 1356 4162 188 5552 956 764 model 1168 1356 4162 188 5552 956 764 brand citroen dacia daihatsu ... renault saab seat skoda smart \ filename 3172 782 386 ... 2749 382 1332 1158 580 model 3172 782 386 ... 2749 382 1332 1158 580 brand ssangyong subaru suzuki toyota volkswagen filename 387 961 1513 389 5912 model 387 961 1513 389 5912 [2 rows x 43 columns]
all_class_plots(df_RD)
Statistics
df_RD.describe()
| filename | brand | model | year | |
|---|---|---|---|---|
| count | 1632113 | 1632113 | 1632113 | 1632113 |
| unique | 1632113 | 57 | 823 | 19 |
| top | /mnt/data/nahuel/images/lancia_delta_2011/1780... | mercedes-benz | 500 | 2018 |
| freq | 1 | 230990 | 6271 | 129294 |
df_RD.groupby(by = ["brand",'year']).describe()
| filename | model | ||||||||
|---|---|---|---|---|---|---|---|---|---|
| count | unique | top | freq | count | unique | top | freq | ||
| brand | year | ||||||||
| abarth | 2008 | 510 | 510 | /mnt/data/nahuel/images/abarth_500_2008/221_7_... | 1 | 510 | 2 | 500 | 264 |
| 2009 | 514 | 514 | /mnt/data/nahuel/images/abarth_500_2009/220_5_... | 1 | 514 | 2 | grande-punto | 281 | |
| 2010 | 455 | 455 | /mnt/data/nahuel/images/abarth_500_2010/184_2_... | 1 | 455 | 2 | 500 | 312 | |
| 2011 | 759 | 759 | /mnt/data/nahuel/images/abarth_500_2011/157_5_... | 1 | 759 | 3 | 500c | 343 | |
| 2012 | 746 | 746 | /mnt/data/nahuel/images/abarth_500c_2012/305_1... | 1 | 746 | 3 | 500 | 350 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| volkswagen | 2014 | 9251 | 9251 | /mnt/data/nahuel/images/volkswagen_passat-vari... | 1 | 9251 | 35 | jetta | 424 |
| 2015 | 9739 | 9739 | /mnt/data/nahuel/images/volkswagen_golf_2015/2... | 1 | 9739 | 38 | cc | 469 | |
| 2016 | 8551 | 8551 | /mnt/data/nahuel/images/volkswagen_maggiolino_... | 1 | 8551 | 37 | jetta | 420 | |
| 2017 | 7776 | 7776 | /mnt/data/nahuel/images/volkswagen_jetta_2017/... | 1 | 7776 | 32 | golf-variant | 457 | |
| 2018 | 7624 | 7624 | /mnt/data/nahuel/images/volkswagen_crafter_201... | 1 | 7624 | 29 | t-roc | 383 | |
784 rows × 8 columns
all_class_plots(df_P1)
Statistics
df_P1.describe()
| filename | brand | model | |
|---|---|---|---|
| count | 130200 | 130200 | 130200 |
| unique | 130200 | 54 | 651 |
| top | /mnt/data/dataset_v1/train/hyundai_terracan/82... | mercedes-benz | subaru_forester |
| freq | 1 | 22000 | 200 |
df_P1.groupby(by = ["brand",'model']).describe()
| filename | |||||
|---|---|---|---|---|---|
| count | unique | top | freq | ||
| brand | model | ||||
| abarth | abarth_124-spider | 200 | 200 | /mnt/data/dataset_v1/train/abarth_124-spider/8... | 1 |
| abarth_500 | 200 | 200 | /mnt/data/dataset_v1/train/abarth_500/211_11_1... | 1 | |
| abarth_500c | 200 | 200 | /mnt/data/dataset_v1/train/abarth_500c/280_6_e... | 1 | |
| abarth_595 | 200 | 200 | /mnt/data/dataset_v1/train/abarth_595/841_5_5e... | 1 | |
| abarth_595-competizione | 200 | 200 | /mnt/data/dataset_v1/train/abarth_595-competiz... | 1 | |
| ... | ... | ... | ... | ... | ... |
| volkswagen | volkswagen_tiguan | 200 | 200 | /mnt/data/dataset_v1/train/volkswagen_tiguan/6... | 1 |
| volkswagen_touareg | 200 | 200 | /mnt/data/dataset_v1/train/volkswagen_touareg/... | 1 | |
| volkswagen_touran | 200 | 200 | /mnt/data/dataset_v1/train/volkswagen_touran/1... | 1 | |
| volkswagen_transporter | 200 | 200 | /mnt/data/dataset_v1/train/volkswagen_transpor... | 1 | |
| volkswagen_up | 200 | 200 | /mnt/data/dataset_v1/train/volkswagen_up/6808_... | 1 | |
651 rows × 4 columns
all_class_plots(join_df_P2)
join_df_P2.describe()
| filename | brand | model | |
|---|---|---|---|
| count | 72569 | 72569 | 72569 |
| unique | 72569 | 43 | 387 |
| top | /mnt/data/dataset_v2/train/bmw_530/4193_3_284a... | mercedes-benz | chrysler_voyager |
| freq | 1 | 8099 | 200 |
join_df_P2.groupby(by = ["brand",'model']).describe()
| filename | |||||
|---|---|---|---|---|---|
| count | unique | top | freq | ||
| brand | model | ||||
| abarth | abarth_500 | 197 | 197 | /mnt/data/dataset_v2/train/abarth_500/57_3_a0b... | 1 |
| abarth_500c | 195 | 195 | /mnt/data/dataset_v2/test/abarth_500c/254_7_e8... | 1 | |
| abarth_595-turismo | 194 | 194 | /mnt/data/dataset_v2/train/abarth_595-turismo/... | 1 | |
| alfa-romeo | alfa-romeo_147 | 193 | 193 | /mnt/data/dataset_v2/train/alfa-romeo_147/153_... | 1 |
| alfa-romeo_156 | 196 | 196 | /mnt/data/dataset_v2/train/alfa-romeo_156/783_... | 1 | |
| ... | ... | ... | ... | ... | ... |
| volkswagen | volkswagen_t5-multivan | 187 | 187 | /mnt/data/dataset_v2/train/volkswagen_t5-multi... | 1 |
| volkswagen_tiguan | 187 | 187 | /mnt/data/dataset_v2/train/volkswagen_tiguan/6... | 1 | |
| volkswagen_touareg | 189 | 189 | /mnt/data/dataset_v2/train/volkswagen_touareg/... | 1 | |
| volkswagen_touran | 192 | 192 | /mnt/data/dataset_v2/train/volkswagen_touran/1... | 1 | |
| volkswagen_transporter | 170 | 170 | /mnt/data/dataset_v2/train/volkswagen_transpor... | 1 | |
387 rows × 4 columns
all_class_plots(join_df_P3)
join_df_P3.describe()
| filename | brand | model | |
|---|---|---|---|
| count | 72569 | 72569 | 72569 |
| unique | 72569 | 43 | 273 |
| top | /mnt/data/dataset_v3/train/chrysler_300c/787_6... | mercedes-benz | mercedes-benz_e_class |
| freq | 1 | 8099 | 2509 |
join_df_P3.groupby(by = ["brand",'model']).describe()
| filename | |||||
|---|---|---|---|---|---|
| count | unique | top | freq | ||
| brand | model | ||||
| abarth | abarth_500_series | 1168 | 1168 | /mnt/data/dataset_v3/train/abarth_500_series/1... | 1 |
| alfa-romeo | alfa-romeo_147 | 193 | 193 | /mnt/data/dataset_v3/train/alfa-romeo_147/207_... | 1 |
| alfa-romeo_156 | 196 | 196 | /mnt/data/dataset_v3/train/alfa-romeo_156/806_... | 1 | |
| alfa-romeo_159 | 195 | 195 | /mnt/data/dataset_v3/train/alfa-romeo_159/100_... | 1 | |
| alfa-romeo_giulietta | 194 | 194 | /mnt/data/dataset_v3/train/alfa-romeo_giuliett... | 1 | |
| ... | ... | ... | ... | ... | ... |
| volkswagen | volkswagen_sharan | 187 | 187 | /mnt/data/dataset_v3/train/volkswagen_sharan/4... | 1 |
| volkswagen_t5 | 727 | 727 | /mnt/data/dataset_v3/train/volkswagen_t5/9587_... | 1 | |
| volkswagen_tiguan | 187 | 187 | /mnt/data/dataset_v3/train/volkswagen_tiguan/6... | 1 | |
| volkswagen_touareg | 189 | 189 | /mnt/data/dataset_v3/train/volkswagen_touareg/... | 1 | |
| volkswagen_touran | 192 | 192 | /mnt/data/dataset_v3/train/volkswagen_touran/1... | 1 | |
273 rows × 4 columns
Stacking Per brand
# Stacked histograms of the per-brand counts, one figure per dataset (raw, v1, v2, v3).
Stacking_graphs(df_RD,"brand")
Stacking_graphs(df_P1,"brand")
Stacking_graphs(join_df_P2,"brand")
Stacking_graphs(join_df_P3,"brand")
Stacking Per Model
# Stacked histograms of the per-model counts, one figure per dataset (raw, v1, v2, v3).
Stacking_graphs(df_RD,"model")
# NOTE(review): the KDE overlay is switched off for v1 only — confirm why.
Stacking_graphs(df_P1,"model",k=False)
Stacking_graphs(join_df_P2,"model")
Stacking_graphs(join_df_P3,"model")
# Correlation heatmaps of the grouped counts (default n=1).
# NOTE(review): df_P1 is grouped by "brand" while the other datasets use "model" — confirm this is intended.
plot_matrix(df_RD,"model")
plot_matrix(df_P1,"brand")
plot_matrix(join_df_P2,"model")
plot_matrix(join_df_P3,"model")
# Raw per-brand count heatmaps (n=2 selects the non-correlation branch).
plot_matrix(df_RD,"brand", 2)
plot_matrix(df_P1,"brand", 2)
plot_matrix(join_df_P2,"brand", 2)
plot_matrix(join_df_P3,"brand", 2)
univ_dist(df_RD,'filename')
univ_dist(df_P1,'filename')
univ_dist(join_df_P2,'filename')
univ_dist(join_df_P3,'filename')
univ_dist(df_RD,'brand')
brand mercedes-benz bmw volkswagen audi fiat citroen renault \ model 230990 147449 130423 81760 72367 68487 68458 brand ford mini opel ... suzuki mazda mitsubishi porsche \ model 67695 64446 59877 ... 28999 28299 27097 26219 brand land-rover skoda honda jeep alfa-romeo jaguar model 24351 23021 19862 18143 17849 17034 [1 rows x 25 columns]
mercedes-benz bmw volkswagen audi fiat citroen \
brand 14.152819 9.03424 7.991052 5.009457 4.433945 4.196217
renault ford mini opel ... suzuki mazda \
brand 4.19444 4.147691 3.948624 3.66868 ... 1.776776 1.733887
mitsubishi porsche land-rover skoda honda jeep \
brand 1.66024 1.606445 1.491992 1.410503 1.21695 1.111626
alfa-romeo jaguar
brand 1.093613 1.043678
[1 rows x 25 columns]
# Pre-processed 1 by brand: absolute counts, then percentage shares (25 brands shown below).
univ_dist(df_P1,'brand')
brand mercedes-benz volkswagen bmw fiat audi mini peugeot citroen \ model 826 574 552 418 398 349 338 307 brand ford opel ... seat alfa-romeo mitsubishi porsche mazda skoda \ model 290 266 ... 152 139 127 126 115 96 brand chevrolet subaru land-rover jaguar model 87 86 82 77 [1 rows x 25 columns]
mercedes-benz volkswagen bmw fiat audi mini \
brand 11.382114 7.909605 7.606449 5.759956 5.48436 4.80915
peugeot citroen ford opel ... seat alfa-romeo \
brand 4.657572 4.230398 3.996142 3.665426 ... 2.094529 1.915392
mitsubishi porsche mazda skoda chevrolet subaru \
brand 1.750034 1.736255 1.584677 1.322861 1.198842 1.185063
land-rover jaguar
brand 1.129944 1.061045
[1 rows x 25 columns]
# Pre-processed 2 by brand: absolute counts, then percentage shares (25 brands shown below).
univ_dist(join_df_P2,'brand')
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \ model 8099 5912 5552 4162 4145 3434 3369 3172 brand ford renault ... alfa-romeo seat mitsubishi skoda porsche \ model 2863 2749 ... 1356 1332 1319 1158 1151 brand mazda land-rover subaru chevrolet dacia model 1148 987 961 956 782 [1 rows x 25 columns]
mercedes-benz volkswagen bmw audi fiat mini \
brand 11.160413 8.146729 7.65065 5.735231 5.711805 4.732048
peugeot citroen ford renault ... alfa-romeo seat \
brand 4.642478 4.371012 3.945211 3.788119 ... 1.868566 1.835494
mitsubishi skoda porsche mazda land-rover subaru \
brand 1.817581 1.595723 1.586077 1.581943 1.360085 1.324257
chevrolet dacia
brand 1.317367 1.077595
[1 rows x 25 columns]
# Pre-processed 3 by brand: absolute counts, then percentage shares (25 brands shown below).
univ_dist(join_df_P3,'brand')
brand mercedes-benz volkswagen bmw audi fiat mini peugeot citroen \ model 8099 5912 5552 4162 3563 3434 3369 3172 brand ford renault ... alfa-romeo seat mitsubishi abarth skoda \ model 2863 2749 ... 1356 1332 1319 1168 1158 brand porsche mazda land-rover subaru chevrolet model 1151 1148 987 961 956 [1 rows x 25 columns]
mercedes-benz volkswagen bmw audi fiat mini \
brand 11.160413 8.146729 7.65065 5.735231 4.90981 4.732048
peugeot citroen ford renault ... alfa-romeo seat \
brand 4.642478 4.371012 3.945211 3.788119 ... 1.868566 1.835494
mitsubishi abarth skoda porsche mazda land-rover \
brand 1.817581 1.609503 1.595723 1.586077 1.581943 1.360085
subaru chevrolet
brand 1.324257 1.317367
[1 rows x 25 columns]
# Raw data by model: absolute counts, then percentage shares (25 models shown below).
univ_dist(df_RD,'model')
model 500 c-220 corsa cooper others golf doblo astra golf-variant \ model 6271 6071 6069 5972 5947 5811 5770 5757 5717 model fiesta ... 330 c-200 c5 octavia scenic e-220 jimny punto \ model 5583 ... 5521 5503 5486 5438 5435 5428 5375 5340 model ibiza 320 model 5338 5257 [1 rows x 25 columns]
500 c-220 corsa cooper others golf doblo \
model 0.384226 0.371972 0.371849 0.365906 0.364374 0.356042 0.353529
astra golf-variant fiesta ... 330 c-200 c5 \
model 0.352733 0.350282 0.342072 ... 0.338273 0.33717 0.336129
octavia scenic e-220 jimny punto ibiza 320
model 0.333188 0.333004 0.332575 0.329328 0.327183 0.327061 0.322098
[1 rows x 25 columns]
# Pre-processed 1 by model: absolute counts, then percentage shares.
# In the output below every listed model has the same count (200).
univ_dist(df_P1,'model')
model volkswagen_up ford_focus-c-max ford_transit-custom \ model 200 200 200 model ford_transit-courier ford_transit-connect ford_transit \ model 200 200 200 model ford_tourneo-custom ford_tourneo-courier ford_tourneo-connect \ model 200 200 200 model ford_tourneo ... ford_kuga ford_grand-c-max ford_galaxy \ model 200 ... 200 200 200 model ford_fusion honda_accord honda_civic honda_cr-v hyundai_i10 \ model 200 200 200 200 200 model hyundai_santa-fe hyundai_matrix model 200 200 [1 rows x 25 columns]
subaru_forester honda_fr-v audi_a5 bmw_218 peugeot_1007 \
model 0.15361 0.15361 0.15361 0.15361 0.15361
mercedes-benz_c-270 kia_niro volkswagen_t6-transporter \
model 0.15361 0.15361 0.15361
abarth_595-competizione citroen_c-crosser ... skoda_citigo \
model 0.15361 0.15361 ... 0.15361
peugeot_407 ds-automobiles_ds-5 mercedes-benz_sl-55-amg \
model 0.15361 0.15361 0.15361
nissan_nv400 nissan_almera-tino mercedes-benz_c-63-amg \
model 0.15361 0.15361 0.15361
volkswagen_tiguan isuzu_d-max porsche_panamera
model 0.15361 0.15361 0.15361
[1 rows x 25 columns]
# Pre-processed 2 by model: absolute counts, then percentage shares (25 models shown below).
univ_dist(join_df_P2,'model')
model chrysler_voyager mercedes-benz_b-170 mini_one audi_tt peugeot_207 \ model 200 199 199 199 199 model volkswagen_jetta kia_carens volkswagen_fox mitsubishi_colt \ model 199 199 198 198 model toyota_prius ... nissan_370z nissan_almera opel_agila opel_meriva \ model 198 ... 197 197 197 197 model peugeot_1007 peugeot_206 fiat_croma peugeot_307 ssangyong_korando \ model 197 197 197 197 197 model suzuki_sx4 model 197 [1 rows x 25 columns]
chrysler_voyager volkswagen_jetta mini_one audi_tt peugeot_207 \
model 0.2756 0.274222 0.274222 0.274222 0.274222
mercedes-benz_b-170 kia_carens volkswagen_fox toyota_prius \
model 0.274222 0.274222 0.272844 0.272844
mazda_3 ... peugeot_206 ssangyong_korando mercedes-benz_s-320 \
model 0.272844 ... 0.271466 0.271466 0.271466
opel_agila mercedes-benz_c-350 suzuki_sx4 nissan_370z \
model 0.271466 0.271466 0.271466 0.271466
nissan_almera mercedes-benz_200 fiat_croma
model 0.271466 0.271466 0.271466
[1 rows x 25 columns]
# Pre-processed 3 by model: absolute counts, then percentage shares (25 models shown below).
univ_dist(join_df_P3,'model')
model mercedes-benz_e_class audi_a_series mercedes-benz_c_class \ model 2509 1700 1659 model mini_cooper abarth_500_series bmw_300_series bmw_500_series \ model 1344 1168 1158 1152 model mercedes-benz_a_class bmw_100_series mini_cooper-countryman ... \ model 964 954 934 ... model volkswagen_passat audi_tt_series volkswagen_beetle audi_r_series \ model 579 579 577 575 model mercedes-benz_m_class mercedes-benz_sl_class mercedes-benz_s_class \ model 575 573 572 model bmw_700_series opel_van fiat_ducato model 562 485 475 [1 rows x 25 columns]
mercedes-benz_e_class audi_a_series mercedes-benz_c_class \
model 3.457399 2.342598 2.2861
mini_cooper abarth_500_series bmw_300_series bmw_500_series \
model 1.85203 1.609503 1.595723 1.587455
mercedes-benz_a_class bmw_100_series mini_cooper-countryman ... \
model 1.328391 1.314611 1.287051 ...
volkswagen_passat audi_tt_series volkswagen_beetle \
model 0.797861 0.797861 0.795105
mercedes-benz_m_class audi_r_series mercedes-benz_sl_class \
model 0.792349 0.792349 0.789593
mercedes-benz_s_class bmw_700_series opel_van fiat_ducato
model 0.788215 0.774435 0.668329 0.654549
[1 rows x 25 columns]
The 19 most frequent values of `year`, with their absolute and relative frequencies.
# Raw data by year: absolute counts, then percentage shares.
# NOTE(review): the output below contains the non-year value "ceed" —
# presumably a model name parsed as a year; verify the year extraction.
univ_dist(df_RD,'year')
year 2018 2017 2015 2016 2014 2013 2012 2011 2009 \ model 129294 125332 117000 115569 114626 105396 98205 95665 91686 year 2008 2010 2007 2006 2005 2004 2003 2002 2001 ceed model 90892 90692 85499 77928 76332 65332 55983 48257 46009 2416
2018 2017 2015 2016 2014 2013 2012 \
year 7.921878 7.679125 7.168621 7.080944 7.023166 6.457641 6.017047
2011 2009 2008 2010 2007 2006 2005 \
year 5.86142 5.617626 5.568977 5.556723 5.238547 4.774669 4.676882
2004 2003 2002 2001 ceed
year 4.002909 3.430093 2.956719 2.818984 0.148029
List of all folders in the S3 bucket, exported in CSV format.
This dataset contains information extracted from the bucket before it was downloaded; it was then compared against the list of files that were actually downloaded.
The comparison revealed empty folders with no images, entries with size = 0, and some corrupted files that were detected at download time.
# Load the CSV listing of the S3 bucket contents.
all_data = pd.read_csv('./list_s3_cars.csv')
# Preview the first rows (Key, LastModified, ETag, Size, ...).
all_data.head()
| Key | LastModified | ETag | Size | StorageClass | OwnerDisplayName | OwnerID | |
|---|---|---|---|---|---|---|---|
| 0 | eu-car-dataset/images/abarth_124-spider_2016/ | 1/1/0001 00:00:00 | NaN | 0 | STANDARD | unknown | unknown |
| 1 | eu-car-dataset/images/abarth_124-spider_2017/ | 1/1/0001 00:00:00 | NaN | 0 | STANDARD | unknown | unknown |
| 2 | eu-car-dataset/images/abarth_124-spider_2018/ | 1/1/0001 00:00:00 | NaN | 0 | STANDARD | unknown | unknown |
| 3 | eu-car-dataset/images/abarth_500_2008/ | 1/1/0001 00:00:00 | NaN | 0 | STANDARD | unknown | unknown |
| 4 | eu-car-dataset/images/abarth_500_2009/ | 1/1/0001 00:00:00 | NaN | 0 | STANDARD | unknown | unknown |
# Column dtypes and non-null counts; in the output below ETag has a single non-null value.
all_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6624 entries, 0 to 6623 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Key 6624 non-null object 1 LastModified 6624 non-null object 2 ETag 1 non-null object 3 Size 6624 non-null int64 4 StorageClass 6624 non-null object 5 OwnerDisplayName 6624 non-null object 6 OwnerID 6624 non-null object dtypes: int64(1), object(6) memory usage: 362.4+ KB
# Re-load the S3 listing so this cell can be run on its own.
all_data = pd.read_csv('./list_s3_cars.csv')
# Keep the rows from position 6462 to the end (162 rows in the output below).
# Presumably these are the entries missing from the download — TODO confirm
# where the 6462 cut-off comes from.
filter_data_missing = all_data.iloc[6462:]
# Summarise the S3 keys of the retained rows.
filter_data_missing['Key'].describe()
count 162 unique 162 top eu-car-dataset/images/volvo_xc70_2004/ freq 1 Name: Key, dtype: object
In the search for the best cleaning model for preprocessing, we created our own sequential model with Keras and tested other architectures and techniques to achieve better performance, including YOLOv5, YOLOv7, MobileNet, MobileNetV2, MobileNetV3 and DenseNet. Project
profile_name = 'Raw Data'
# Take the date stamp once so the report title and the output file name
# cannot disagree (e.g. when the cell runs across midnight).
report_date = datetime.datetime.today().strftime('%Y-%m-%d')
# Build the profiling report for the raw data.
profile = ProfileReport(df_RD, title=f"TEAM 1 : {profile_name} {report_date}")
print('---Saving profile...')
# Save the profile in the current working directory. The explicit ".html"
# suffix states the output format instead of relying on how pandas-profiling
# treats a path without an extension.
profile.to_file(output_file=os.path.join(os.getcwd(), f"{profile_name}{report_date}.html"))
---Saving profile...
# Show the raw-data report inline in the notebook.
profile.to_notebook_iframe()
profile_name_ = 'Pre-processed 1'
# Take the date stamp once so the report title and the output file name
# cannot disagree (e.g. when the cell runs across midnight).
report_date = datetime.datetime.today().strftime('%Y-%m-%d')
# Build the profiling report for the Pre-processed 1 data.
profile_ = ProfileReport(df_P1, title=f"TEAM 1 : {profile_name_} {report_date}")
print('---Saving profile...')
# Save the profile in the current working directory. The explicit ".html"
# suffix states the output format instead of relying on how pandas-profiling
# treats a path without an extension.
profile_.to_file(output_file=os.path.join(os.getcwd(), f"{profile_name_}{report_date}.html"))
---Saving profile...
# Show the Pre-processed 1 report inline in the notebook.
profile_.to_notebook_iframe()
profile_name__ = 'Pre-processed 2'
# Take the date stamp once so the report title and the output file name
# cannot disagree (e.g. when the cell runs across midnight).
report_date = datetime.datetime.today().strftime('%Y-%m-%d')
# Build the profiling report for the Pre-processed 2 data.
profile__ = ProfileReport(join_df_P2, title=f"TEAM 1 : {profile_name__} {report_date}")
print('---Saving profile...')
# Save the profile in the current working directory. The explicit ".html"
# suffix states the output format instead of relying on how pandas-profiling
# treats a path without an extension.
profile__.to_file(output_file=os.path.join(os.getcwd(), f"{profile_name__}{report_date}.html"))
---Saving profile...
# Show the Pre-processed 2 report inline in the notebook.
profile__.to_notebook_iframe()
profile_name___ = 'Pre-processed 3'
# Take the date stamp once so the report title and the output file name
# cannot disagree (e.g. when the cell runs across midnight).
report_date = datetime.datetime.today().strftime('%Y-%m-%d')
# Build the profiling report for the Pre-processed 3 data.
profile___ = ProfileReport(join_df_P3, title=f"TEAM 1 : {profile_name___} {report_date}")
print('---Saving profile...')
# Save the profile in the current working directory. The explicit ".html"
# suffix states the output format instead of relying on how pandas-profiling
# treats a path without an extension.
profile___.to_file(output_file=os.path.join(os.getcwd(), f"{profile_name___}{report_date}.html"))
---Saving profile...
# Show the Pre-processed 3 report inline in the notebook.
profile___.to_notebook_iframe()

Carlos
The graphs show that filtering for quality data during processing improves the dataset; this should give us a good model and improve our results.
Nico
The quality of the source data forces us to make some decisions: reducing the number of classes is one of them, as is balancing the classes by fixing a total number of images for each of them.
Enzo
To better balance the data we could use clustering, and we could improve the dataset by looking for more images.
Jhoel
Corrupted data and missing images should be discarded to avoid bias.
Agus
Some images cannot be opened; a script could delete them, which would save time and reduce the size of the data.
Nahu
To balance the data we need to agree on a criterion and filter out everything that is not a car; this will require several models.